# download_dsj_issue_live.py
# DSJ (Data Science Journal) Downloader
# -------------------------------------------------
# Automates downloading PDFs from *Data Science Journal* issue pages
# - Parses article titles from the provided live issue page and finds each PDF link on the article's own page
# - Skips articles in the “Editorial Content” and “Corrections” sections using <h2> heading filters
# - Displays a count of articles found after exclusions for quick verification
# - Handles absolute and relative PDF URLs with urljoin for reliability
# - Saves each PDF with sanitized filenames for cross-platform compatibility
# - Creates dynamic folder names like DSJ_Vol19_2020 from issue metadata
# - Logs all downloads and skipped items to CSV
# - Works for any DSJ issue page with the same layout


import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# ---- tweakables -------------------------------------------------------------
# Regex source for section headings whose articles are not downloaded.
# download_dsj_issue() compiles it case-insensitively and searches the text of
# each <h2> heading with it; add alternatives inside the group to skip more.
SKIP_SECTIONS_RE = r"(Editorial\s*Content|Corrections)"
# -----------------------------------------------------------------------------

def sanitize_filename(name):
    r"""Return *name* with characters that are unsafe in filenames replaced by "_".

    The replaced characters are the ones Windows reserves in file names
    (``\ / * ? : " < > |``) plus the ASCII control characters
    ``\x00``-``\x1f`` (newlines, tabs, NUL, ...), which can survive inside a
    scraped title and are rejected by Windows as well.

    Args:
        name: Raw text to turn into a filename component (no extension logic
            is applied; the caller appends it).

    Returns:
        The text with each unsafe character replaced one-for-one by "_".
        The length is unchanged and an empty string is returned as-is.
    """
    return re.sub(r'[\\/*?:"<>|\x00-\x1f]', "_", name)

def download_dsj_issue(issue_url):
    """Download the article PDFs linked from a Data Science Journal issue page.

    The issue page is fetched and scanned in document order: every ``<h2>``
    sets the current section, and every ``<div data-sentry-component="Title">``
    is treated as an article entry of that section. Entries under a section
    whose heading matches ``SKIP_SECTIONS_RE`` are excluded. For each remaining
    entry the article page is fetched, the first link whose text contains
    "PDF" is resolved against the article URL, and the file is saved as
    ``<sanitized title>.pdf`` in a folder named ``DSJ_Vol<volume>_<year>``
    (``DSJ_VolXX_Year`` when the page ``<title>`` has no "Volume N - YYYY").

    Every entry and its outcome is also written to ``download_log.csv`` in
    that folder, and progress is printed to stdout.

    Args:
        issue_url: URL of the issue page; relative links are resolved
            against it.

    Raises:
        requests.RequestException: If the issue page itself cannot be
            fetched (connection error, timeout, or HTTP error status).
        OSError: If the output folder or the CSV log cannot be created.

    Failures on individual articles do not raise: they are printed, counted,
    logged, and the run continues with the next article.
    """
    import csv  # stdlib; imported here because only this function uses it

    # requests waits indefinitely when no timeout is given, so one stalled
    # connection would hang the whole run.
    request_timeout = 30  # seconds

    print(f"Fetching issue page: {issue_url}")
    resp = requests.get(issue_url, timeout=request_timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # Get volume/year for folder naming; placeholders are used when the page
    # title is missing or does not match the expected pattern.
    vol, year = "XX", "Year"
    title_tag = soup.find("title")
    if title_tag:
        m = re.search(r"Volume\s+(\d+)\s*-\s*(\d{4})", title_tag.get_text())
        if m:
            vol, year = m.groups()

    folder_name = f"DSJ_Vol{vol}_{year}"
    os.makedirs(folder_name, exist_ok=True)

    # Compile once, outside the per-article loop.
    skip_re = re.compile(SKIP_SECTIONS_RE, re.I)
    pdf_text_re = re.compile(r"PDF", re.I)

    # First pass: split article entries into kept and excluded, tracking the
    # most recent <h2> as the section each entry belongs to.
    valid_items = []
    excluded_items = []
    current_section = None
    for el in soup.find_all(["h2", "div"]):
        if el.name == "h2":
            current_section = el.get_text(strip=True)
            continue
        if el.get("data-sentry-component") != "Title":
            continue
        if current_section and skip_re.search(current_section):
            excluded_items.append((current_section, el))
        else:
            valid_items.append(el)

    print(f"[INFO] Found {len(valid_items)} items after skipping sections matching /{SKIP_SECTIONS_RE}/i")

    articles_downloaded = 0
    skipped_articles = 0
    failed_articles = 0
    # Lower-cased names already written in this run, so that two titles which
    # sanitize to the same name do not overwrite each other (also on
    # case-insensitive filesystems).
    used_filenames = set()

    log_path = os.path.join(folder_name, "download_log.csv")
    # newline="" is what the csv module requires for files it writes to.
    with open(log_path, "w", newline="", encoding="utf-8") as log_file:
        log = csv.writer(log_file)
        log.writerow(["title", "article_url", "pdf_url", "filename", "status", "detail"])

        for section, el in excluded_items:
            link = el.find("a", href=True)
            excluded_url = urljoin(issue_url, link["href"]) if link else ""
            log.writerow([el.get_text(strip=True), excluded_url, "", "", "skipped_section", section])

        for el in valid_items:
            a_tag = el.find("a", href=True)
            if not a_tag:
                log.writerow([el.get_text(strip=True), "", "", "", "skipped_no_link", ""])
                continue
            title = a_tag.get_text(strip=True)
            article_url = urljoin(issue_url, a_tag["href"])

            # Fetch article page; a failure here must not abort the issue.
            try:
                art_resp = requests.get(article_url, timeout=request_timeout)
                art_resp.raise_for_status()
            except requests.RequestException as e:
                print(f"❌ Error fetching article page for {title}: {e}")
                failed_articles += 1
                log.writerow([title, article_url, "", "", "failed", str(e)])
                continue
            art_soup = BeautifulSoup(art_resp.text, "html.parser")

            # Get PDF link
            pdf_link_tag = art_soup.find("a", href=True, string=pdf_text_re)
            if not pdf_link_tag:
                print(f"⚠️ No PDF found for: {title}")
                skipped_articles += 1
                log.writerow([title, article_url, "", "", "skipped_no_pdf", ""])
                continue
            pdf_url = urljoin(article_url, pdf_link_tag["href"])

            # Build a unique filename; an empty title would otherwise produce
            # the hidden file ".pdf".
            stem = sanitize_filename(title) or "untitled"
            filename = f"{stem}.pdf"
            suffix = 2
            while filename.lower() in used_filenames:
                filename = f"{stem}_{suffix}.pdf"
                suffix += 1
            used_filenames.add(filename.lower())
            filepath = os.path.join(folder_name, filename)

            # Download PDF
            try:
                pdf_resp = requests.get(pdf_url, timeout=request_timeout)
                pdf_resp.raise_for_status()
                with open(filepath, "wb") as f:
                    f.write(pdf_resp.content)
            except (requests.RequestException, OSError) as e:
                print(f"❌ Error downloading {filename}: {e}")
                failed_articles += 1
                log.writerow([title, article_url, pdf_url, filename, "failed", str(e)])
            else:
                articles_downloaded += 1
                print(f"[{articles_downloaded}] Downloaded: {filename}")
                log.writerow([title, article_url, pdf_url, filename, "downloaded", ""])

    print(f"Done! {articles_downloaded} PDFs saved in {folder_name}")
    if skipped_articles:
        print(f"Skipped {skipped_articles} articles without PDFs.")
    if failed_articles:
        print(f"Failed to download {failed_articles} articles; see {log_path}.")

if __name__ == "__main__":
    # Script entry point: ask for the issue URL and download that issue.
    issue_url = input("Paste DSJ issue URL (e.g., https://datascience.codata.org/32/volume/19/issue/1): ").strip()
    if not issue_url:
        # An empty string would otherwise be passed to requests and end in a
        # traceback instead of a readable message.
        raise SystemExit("No issue URL entered; nothing to download.")
    download_dsj_issue(issue_url)
